library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(forecast)
## Warning: package 'forecast' was built under R version 3.6.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(fpp))
## Warning: package 'fpp' was built under R version 3.6.3
## Warning: package 'fma' was built under R version 3.6.3
## Warning: package 'expsmooth' was built under R version 3.6.3
## Warning: package 'lmtest' was built under R version 3.6.3
## Warning: package 'zoo' was built under R version 3.6.3
## Warning: package 'tseries' was built under R version 3.6.3
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(hrbrthemes))
## Warning: package 'hrbrthemes' was built under R version 3.6.3
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(viridis))
## Warning: package 'viridis' was built under R version 3.6.3
suppressPackageStartupMessages(library(plotly))
## Warning: package 'plotly' was built under R version 3.6.3
suppressPackageStartupMessages(library(gapminder))
## Warning: package 'gapminder' was built under R version 3.6.3
suppressPackageStartupMessages(library(htmlwidgets))
## Warning: package 'htmlwidgets' was built under R version 3.6.3
suppressPackageStartupMessages(library(emojifont))
## Warning: package 'emojifont' was built under R version 3.6.3
suppressPackageStartupMessages(library(wordcloud2))
## Warning: package 'wordcloud2' was built under R version 3.6.3
options(warn = -1)
setwd("C:/Users/skakar/Desktop/PERSONAL/Blog")
#Import Data and Convert to Text
whatsapp_txt <- readLines("WhatsApp Chat with Mom.txt")
wa_data = as.data.frame(whatsapp_txt)
wa_data = wa_data %>% rename (Texts = whatsapp_txt)
# Add Index to the Document
wa_data <- tibble::rowid_to_column(wa_data, "ID")
# Regex
dates = ""
times = ""
sender = ""
message = ""
wa_data$Texts = as.character(wa_data$Texts)
# Loop uses Regex to get Data from .TXT file
for ( i in 1:nrow(wa_data) ){
hasText = str_extract(wa_data[i,2], "(.*):(.*)")
hasDate = str_extract(wa_data[i,2], "\\d+/\\d+/\\d+")
if(!is_empty(hasText) && !is.na(hasDate))
{
dates[i] = str_extract(wa_data[i,2], "\\d+/\\d+/\\d+")
times[i] = str_extract(wa_data[i,2], "\\d+:\\d(.*)[AM]")
sender[i] = str_remove(substring(str_extract(wa_data[i,2], "[AM - ](.*)[:]"), 12), ":" )
message[i] = unlist(strsplit(str_extract(wa_data[i,2], ":(.*)"), ": "))[2]
}
else
{
wa_data = wa_data[-c(i),]
}
}
# Append Data to Dataset
wa_data$Date = dates
wa_data$Time = times
wa_data$Time = trimws(str_trunc(wa_data$Time, 8, "right", ellipsis = ""))
wa_data$Sender = trimws(sender)
wa_data$Message = message
# Convert to Date Type
wa_data$Datetime <- paste(wa_data$Date, wa_data$Time)
wa_data$Datetime <- as.POSIXct(wa_data$Datetime, format = "%m/%d/%Y %I:%M %p", tz = "GMT")
year(wa_data$Datetime) <- year(wa_data$Datetime) + 2000
wa_data$Year = year(wa_data$Datetime)
wa_data$Month = as.factor(months(wa_data$Datetime))
wa_data$Month = factor(wa_data$Month,levels=c("January","February","March", "April","May","June","July","August","September", "October","November","December"),ordered=TRUE)
# MetaData
str(wa_data)
## 'data.frame': 4323 obs. of 9 variables:
## $ ID : int 1 2 3 4 6 8 9 10 11 12 ...
## $ Texts : chr "7/22/17, 4:05 PM - Messages to this chat and calls are now secured with end-to-end encryption. Tap for more info." "7/26/17, 8:45 AM - Mom: Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions," "7/26/17, 11:02 AM - Mom: <Media omitted>" "7/26/17, 11:04 AM - Mom: Rohit Rana" ...
## $ Date : chr "7/22/17" "7/26/17" "7/26/17" "7/26/17" ...
## $ Time : chr "4:05 PM" "8:45 AM" "11:02 AM" "11:04 AM" ...
## $ Sender : chr "" "Mom" "Mom" "Mom" ...
## $ Message : chr NA "Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions," "<Media omitted>" "Rohit Rana" ...
## $ Datetime: POSIXct, format: "2017-07-22 16:05:00" "2017-07-26 08:45:00" ...
## $ Year : num 2017 2017 2017 2017 NA ...
## $ Month : Ord.factor w/ 12 levels "January"<"February"<..: 7 7 7 7 NA NA 7 7 7 8 ...
# DATA
head(wa_data)
## ID
## 1 1
## 2 2
## 3 3
## 4 4
## 6 6
## 8 8
## Texts
## 1 7/22/17, 4:05 PM - Messages to this chat and calls are now secured with end-to-end encryption. Tap for more info.
## 2 7/26/17, 8:45 AM - Mom: Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions,
## 3 7/26/17, 11:02 AM - Mom: <Media omitted>
## 4 7/26/17, 11:04 AM - Mom: Rohit Rana
## 6 Overland Park, KS 66223
## 8 7/26/17, 11:04 AM - Mom: This is rohit fanaa address
## Date Time Sender
## 1 7/22/17 4:05 PM
## 2 7/26/17 8:45 AM Mom
## 3 7/26/17 11:02 AM Mom
## 4 7/26/17 11:04 AM Mom
## 6 <NA> <NA> <NA>
## 8 <NA> <NA> <NA>
## Message
## 1 <NA>
## 2 Bread, butter, eggs, oil, potatoes,salt,sugar,cereals,glass,Knife,fruits,vegetables,onions,
## 3 <Media omitted>
## 4 Rohit Rana
## 6 <NA>
## 8 <NA>
## Datetime Year Month
## 1 2017-07-22 16:05:00 2017 July
## 2 2017-07-26 08:45:00 2017 July
## 3 2017-07-26 11:02:00 2017 July
## 4 2017-07-26 11:04:00 2017 July
## 6 <NA> NA <NA>
## 8 <NA> NA <NA>
# Data Analysis
print(mean(sapply(strsplit(wa_data$Message, " "), length))) # Average Number of Words per message
## [1] 3.960675
# Chat Distribution
barplot(table(subset(wa_data$Sender, wa_data$Sender %in% as.array(c("Mom","Sunny Kakar")))), main="Chat Distribution",
xlab="Senders", ylab="Message Count", col = c("#56B4E9", "#009E73"))

# Chat Consistency Through Time
PlotData <- function(data, yearid){
year_data = data %>% filter( year(Datetime) == yearid )
message = paste("Message Distribution: ",as.character(yearid))
year_data %>%
ggplot( aes(x=Datetime)) +
geom_density(fill="#69b3a2", color="#e9ecef", alpha=0.8) +
ggtitle(message) +
theme_ipsum()
}
PlotData(wa_data, "2017")

PlotData(wa_data, "2018")

PlotData(wa_data, "2019")

PlotData(wa_data, "2020")

# Conversation
plot2 = ggplot(data=subset(wa_data, !is.na(Month)), aes(x=Month, group=Year, fill=Year)) +
ggtitle("Message Count Density (Monthly)") +
geom_histogram(stat="count") +
theme_ipsum()
ggplotly(plot2)
# Scatter Plot
texts_me = table(format(subset(wa_data$Datetime, wa_data$Sender %in% as.array(c("Sunny Kakar"))),"%Y-%m"))
scatter = as.data.frame(texts_me)
scatter = scatter %>% rename (Text_Frequency_Me = Freq)
scatter = scatter %>% rename (MYTime = Var1)
scatter$Text_Frequency_Them = table(format(subset(wa_data$Datetime, wa_data$Sender %in% as.array(c("Mom"))),"%Y-%m"))
ggplot(scatter, aes(x=scatter$Text_Frequency_Me, y=scatter$Text_Frequency_Them)) +
geom_point() +
geom_smooth(method=lm , color="green", se=FALSE) +
geom_rug(col="steelblue",alpha=0.1, size=1.5) +
theme_ipsum() +
labs(x = "My Text Frequency", y = "Their Text Frequency", title = "Scatterplot (Corr. b/w Texting Frequency)")
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'

# Most Common Texts sent
# -- My Top Texts
my_top_texts = as.data.frame(sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Sunny Kakar")))), decreasing=T)[1:10])
my_top_texts
## Var1 Freq
## 1 <Media omitted> 123
## 2 Hanji 65
## 3 ðŸ‘\215ðŸ\217¼ 29
## 4 Ok 27
## 5 Good morning Mom 18
## 6 Mom 18
## 7 Yep 18
## 8 Okay 15
## 9 Morning 10
## 10 Okay. 10
# -- Their Top Texts
their_top_texts = as.data.frame(sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Mom")))), decreasing=T)[1:10])
their_top_texts
## Var1 Freq
## 1 Missed video call 647
## 2 Missed voice call 253
## 3 <Media omitted> 225
## 4 Ok 93
## 5 Good Morning beta 71
## 6 ðŸ‘\215 55
## 7 ðŸ\230\230 23
## 8 R u busy 19
## 9 Where r u 19
## 10 R u in class 17
wordcloud2(data=sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Sunny Kakar")))), decreasing=T)[1:50], size=1.6)
wordcloud2(data=sort(table(subset(wa_data$Message, wa_data$Sender %in% as.array(c("Mom")))), decreasing=T)[1:50], size=10)
print("Thank you!")
## [1] "Thank you!"